# %%

import pkg_resources
import subprocess

def remove_package(package):
    """Uninstall *package* with pip, without prompting for confirmation.

    pip is run as ``<current interpreter> -m pip`` so the package is removed
    from the environment this script is running in, not from whichever
    ``pip`` executable happens to come first on PATH.

    Args:
        package: Name of the distribution to uninstall.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Imported locally so this function does not depend on the order of the
    # module-level imports.
    import sys

    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'uninstall', package, "--yes"]
    )

def install_package(package, version):
    """Install exactly ``package==version`` into the user site with pip.

    pip is run as ``<current interpreter> -m pip`` so the package is
    installed for the interpreter executing this script, not for whichever
    ``pip`` executable happens to come first on PATH.

    Args:
        package: Name of the distribution to install.
        version: Exact version string to pin.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Imported locally so this function does not depend on the order of the
    # module-level imports.
    import sys

    # NOTE(review): pip may reject "--user" inside a virtual environment --
    # confirm this flag is wanted for every environment the script runs in.
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', f"{package}=={version}", "--user"]
    )

def check_and_install_packages(package_dict):
    """Bring every package in *package_dict* to its pinned version.

    For each ``name -> version`` entry:

    * the pinned version is already installed: nothing is done;
    * a different version is installed: it is removed, then the pinned
      version is installed;
    * the package is not installed: the pinned version is installed.

    Args:
        package_dict: Mapping of distribution name to exact version string.

    Errors raised by ``remove_package`` / ``install_package`` propagate to
    the caller.
    """
    for package, version in package_dict.items():
        try:
            installed_version = pkg_resources.get_distribution(package).version
        except pkg_resources.DistributionNotFound:
            print(f"{package} is not installed.")
        else:
            if installed_version == version:
                # Already at the requested pin: avoid a pointless uninstall
                # and re-download.
                print(f"{package}=={version} is already installed. Skipping.")
                continue
            print(f"{package} is already installed. Removing and reinstalling...")
            remove_package(package)

        install_package(package, version)
        print(f"{package} installed successfully.")

# Exact versions of the third-party packages this script is pinned to,
# listed as (distribution name, version) pairs.
# NOTE(review): "pathlib" also ships with the Python standard library --
# confirm that installing the PyPI distribution of it is intentional.
_PINNED_PACKAGES = (
    ("Rtree", "1.0.1"),
    ("seaborn", "0.12.2"),
    ("pathlib", "1.0.1"),
    ("geopandas", "0.13.2"),
    ("contextily", "1.3.0"),
    ("shapely", "2.0.1"),
    ("numpy", "1.24.0"),
    ("pandas", "2.0.3"),
    ("Pillow", "10.0.0"),
)
package_dict = dict(_PINNED_PACKAGES)

# Run the check/install pass over every pin before the imports below.
check_and_install_packages(package_dict)


import os, sys, glob
import re

from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
# Font settings handed to matplotlib's "font" rc group.
font = dict(family='IBM Plex Sans', weight='normal', size=10)
plt.rc('font', **font)


# parallel
from multiprocessing import Pool
from joblib import Parallel, delayed

# geo
import geopandas as gpd
import rtree
from shapely.geometry import shape, mapping, Point, LinearRing

# Work from the directory that contains this file, so the relative data
# paths used further down resolve next to the script.
try:
    # Get the current file path
    file_path = os.path.abspath(__file__)
except NameError:
    # `__file__` is not defined when the cells are executed interactively
    # (e.g. in an IPython/Jupyter session); keep the current working
    # directory instead of failing.
    file_path = None
    directory = os.getcwd()
else:
    # Get the directory containing the file
    directory = os.path.dirname(file_path)

# Set the working directory to the file's directory
os.chdir(directory)

# %%


# %%
import os, sys, glob
import re

from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
# Font settings handed to matplotlib's "font" rc group.
font = dict(family='IBM Plex Sans', weight='normal', size=10)
plt.rc('font', **font)


# Have IPython display the value of every expression statement in a cell,
# not only the last one; later cells that contain several bare expressions
# (e.g. `df.shape` followed by `df2.shape`) rely on this.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# parallel
from multiprocessing import Pool
from joblib import Parallel, delayed

# geo
import geopandas as gpd
import rtree
from shapely.geometry import shape, mapping, Point, LinearRing

# %%


# Load the Stata file and print its column summary.
df = pd.read_stata('atlas_clean.dta')
df.info()

# %%
df.head()

# %%
# Count of non-missing `metal_class` values per locality, largest first.
per_locality = df[['metal_class', 'locality']].groupby('locality').count()
per_locality.sort_values(by='metal_class', ascending=False)

# %%
# Minerals excluded from the filtered frame `df2`.
dropminerals = [
    'limestone',
    'fire clay',
    'clay',
    'quartz glass/silica sand',
    'mica',
    'ochre',
    'talc',
    'gypsum',
    'salt',
]
is_dropped = df['mineral'].isin(dropminerals)
df2 = df.loc[~is_dropped]

# %%
# Row/column counts before and after the filter.
df.shape
df2.shape

# %%
# Frequency tables for the filtered rows.
df2['metal_class'].value_counts()
df2['mineral'].value_counts()

# %% [markdown]
# ## Convert to Geodata

# %%
# Turn each row into a point built from its longitude/latitude columns.
# NOTE(review): this uses the unfiltered `df`, not `df2` -- confirm that
# is intended.
mine_points = gpd.points_from_xy(df['longitude'], df['latitude'])
mines = gpd.GeoDataFrame(df, geometry=mine_points)
mines['mineral'].nunique()

# %%
# Labels of the 20 minerals with the most rows in `mines`.
mineral_counts = mines['mineral'].value_counts()
top20 = mineral_counts.nlargest(20).index

# %% [markdown]
# # Maps

# %%
# Boundary polygons read from the shapefile; drawn as outlines on the map.
states = gpd.read_file("IND_adm1.shp")

# %%
f, ax = plt.subplots(nrows=1, figsize=(10, 12), dpi=200)

# Outlines only: no fill, thin black edges.
states.plot(ax=ax, facecolor='None', edgecolor='k', linewidth=0.5)

# Rows whose mineral is one of the `top20` labels, coloured by mineral.
legend_options = {
    'loc': 'lower right',
    'markerscale': 0.3,
    'ncol': 2,
    'prop': {'size': 10},
}
top_mines = mines.loc[mines['mineral'].isin(top20)]
top_mines.plot(
    ax=ax,
    column='mineral',
    categorical=True,
    markersize=0.8,
    cmap='tab20',
    legend=True,
    legend_kwds=legend_options,
)

ax.set_axis_off()
ax.set_title("Locations of Mines in India \n 20 most common minerals \n State borders overlaid")
f.savefig('appendix_figureA5.pdf')